# ===============================================
# Robust Prismatic Recursion Benchmark (Auto GPU Detect + φ-weighted)
# ===============================================
import pyopencl as cl
import numpy as np
import time
import itertools

# ---------------------------
# Auto-detect GPU/CPU device
# ---------------------------
def _first_device(kind):
    """Return the first OpenCL device whose type includes *kind*, or None.

    Platforms are scanned in the order ``cl.get_platforms()`` reports them,
    and devices in the order each platform lists them.
    """
    for platform in cl.get_platforms():
        for candidate in platform.get_devices():
            # ``type`` is a bitfield, so test membership with a bitwise AND.
            if candidate.type & kind:
                return candidate
    return None


# Preference order: GPU first, then CPU, then a dedicated accelerator.
device = _first_device(cl.device_type.GPU)

if device is None:
    print("[WARN] No GPU found via OpenCL. Falling back to CPU.")
    device = _first_device(cl.device_type.CPU)

if device is None:
    # Accelerator-only systems used to be rejected even though a usable
    # device was present; try that device class before giving up.
    print("[WARN] No CPU found via OpenCL. Falling back to accelerator.")
    device = _first_device(cl.device_type.ACCELERATOR)

if device is None:
    raise RuntimeError("No OpenCL devices found on this system.")

# Single-device context and its command queue, used by everything below.
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)

print("Using device:", device.name)
print("Global Memory (MB):", device.global_mem_size // 1024**2)
print("Compute Units:", device.max_compute_units)
print("Max Clock (MHz):", device.max_clock_frequency)

# ---------------------------
# Recursive expansion model
# ---------------------------
def expansion(depth):
    """Return the expansion factor for *depth*, i.e. 8 raised to ``depth``."""
    return pow(8, depth)

def total_instances(depth):
    """Return the instance count at *depth*: the seed count times ``expansion(depth)``."""
    seed_count = 1 << 17  # 131072 seeds, chosen as a VRAM-safe base
    return seed_count * expansion(depth)

# ---------------------------
# Kernel (simulate recursion FLOPs)
# ---------------------------
# OpenCL C source for the benchmark kernel.
# Each work-item reads one float from ``data`` at its global id, applies
# x = sqrt(x * 1.618f + 0.5f) * 1.0001f a total of ``expansion`` times, and
# writes the result back to the same slot. The kernel has no bounds check,
# so the launch's global size must not exceed the number of floats in ``data``.
kernel_code = """
__kernel void recurse(
    __global float *data,
    const int expansion)
{
    int gid = get_global_id(0);
    float x = data[gid];
    for(int i=0; i<expansion; i++){
        x = sqrt(x * 1.618f + 0.5f) * 1.0001f;
    }
    data[gid] = x;
}
"""
# Build the program for the selected context; ``program.recurse`` is then
# the callable kernel used by the benchmark loop.
program = cl.Program(ctx, kernel_code).build()

# ---------------------------
# Benchmark Loop
# ---------------------------
phi = 1.6180339887
depths = range(1, 11)
timed_runs = 5  # kernel launches averaged per depth
results = []  # one (depth, fps, gflops, weighted_gflops) tuple per depth

for depth in depths:
    N = min(total_instances(depth), 2**20)  # safe clamp to avoid OOM
    expansion_factor = expansion(depth)
    # The kernel takes a 32-bit int. 8**10 still fits; a depth above 10
    # would exceed the int32 range.
    # NOTE(review): the loop count grows 8x per depth, so the deepest runs
    # may take very long on real hardware.
    iterations = np.int32(expansion_factor)

    data = np.random.rand(N).astype(np.float32)
    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    try:
        # Warmup launch, kept out of the timed region.
        program.recurse(queue, (N,), None, buf, iterations).wait()

        # Timed run: perf_counter is monotonic and high resolution, so short
        # runs cannot produce a zero interval and a division by zero below.
        t0 = time.perf_counter()
        for _ in range(timed_runs):
            evt = program.recurse(queue, (N,), None, buf, iterations)
        # Only the last event is waited on; this assumes the default queue
        # executes launches in order.
        evt.wait()
        dt = (time.perf_counter() - t0) / timed_runs
    finally:
        # Free the device buffer now instead of waiting for garbage collection.
        buf.release()

    fps = 1.0 / dt
    vram_mb = data.nbytes / 1024**2
    # Counts one operation per loop iteration per work-item, so this is an
    # iteration rate in units of 1e9/s rather than a count of individual
    # floating-point instructions.
    flops = (N * expansion_factor) / dt / 1e9
    weighted_flops = flops * (phi ** ((depth-1)/4))  # φ-weighted GFLOPs

    results.append((depth, fps, flops, weighted_flops))
    print(f"Depth {depth:2d} | N={N:,} | VRAM={vram_mb:.1f} MB | {fps:.2f} FPS | {flops:.2f} GFLOPs | Weighted={weighted_flops:.2f}")

# ---------------------------
# Find top combos by weighted throughput
# ---------------------------
def _weighted_score(row):
    """Return the φ-weighted GFLOPs field (index 3) of a result tuple."""
    return row[3]


def _combo_score(rows):
    """Return the summed φ-weighted GFLOPs of a group of result tuples."""
    return sum([_weighted_score(row) for row in rows])


# Rank every depth from highest to lowest weighted throughput.
weighted_results = list(results)
weighted_results.sort(key=_weighted_score, reverse=True)

print("\nTop individual depths by φ-weighted GFLOPs:")
for r in itertools.islice(weighted_results, 5):
    print(f"Depth {r[0]:2d} | GFLOPs={r[2]:.2f} | Weighted={r[3]:.2f}")

# Exhaustive search over all pairs and triples of depths for the largest
# combined weighted score.
best_2 = max(itertools.combinations(weighted_results, 2), key=_combo_score)
best_3 = max(itertools.combinations(weighted_results, 3), key=_combo_score)

print(f"\nBest combo of size 2: {[x[0] for x in best_2]} -> {sum([x[3] for x in best_2]):.2f} weighted GFLOPs")
print(f"Best combo of size 3: {[x[0] for x in best_3]} -> {sum([x[3] for x in best_3]):.2f} weighted GFLOPs")
